Correlation between functional selections on humanDAG1 and mastomysDAG1 cells

Compare functional scores (effects on cell entry) measured on 293T cells and on 293TΔDAG1 cells expressing humanDAG1 or mastomysDAG1.

In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import altair as alt

# Allow more rows for Altair: lift its row limit so the full merged table can
# be passed to `alt.Chart` in the plotting cells below. The return value is
# bound to `_` so that it is not echoed as the cell's output.
_ = alt.data_transformers.disable_max_rows()

File paths for input data, filter thresholds, and output locations:

In [2]:
# this cell is tagged as `parameters` for papermill parameterization
# All values are `None` placeholders here; real values are assigned in the
# following cell.

# CSV files of functional effects, one per cell line; each is loaded with
# `pd.read_csv` and merged on site/wildtype/mutant.
HEK293T_data_path = None
humanDAG1_data_path = None
mastomysDAG1_data_path = None
# Minimum times seen filter: threshold applied to the `average_times_seen`
# column (row-wise mean of the three `times_seen_*` columns) when computing
# correlations; also the initial value of the interactive slider.
MTS = None 
# Required value of `n_selections_*` in all three data sets; rows where any of
# them differs are dropped after merging.
n_selections = None

# Directory created (if missing) before the chart is saved.
html_dir = None
# File path the interactive correlation chart is saved to.
html_output = None
In [3]:
# Parameters
# Concrete values for this run; they overwrite the `None` placeholders assigned
# in the `parameters`-tagged cell above (presumably injected by papermill --
# confirm before editing by hand). Paths are relative to the working directory.
HEK293T_data_path = "results/func_effects/averages/293T_entry_func_effects.csv"
humanDAG1_data_path = "results/func_effects/averages/human_293T_entry_func_effects.csv"
mastomysDAG1_data_path = (
    "results/func_effects/averages/mastomys_293T_entry_func_effects.csv"
)
MTS = 2
n_selections = 8
html_dir = "results/DAG1_ortholog_correlations/"
html_output = "results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html"
In [4]:
# # Uncomment the lines below to run this notebook interactively
# HEK293T_data_path = "../results/func_effects/averages/293T_entry_func_effects.csv"
# humanDAG1_data_path = "../results/func_effects/averages/human_293T_entry_func_effects.csv"
# mastomysDAG1_data_path = "../results/func_effects/averages/mastomys_293T_entry_func_effects.csv"
# # Minimum times seen filter
# MTS = 2
# n_selections = 8

# html_dir = "../results/DAG1_ortholog_correlations/"
# html_output = "../results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html"
In [5]:
# Load the functional-effect table for each cell line
hek_df, human_df, mastomys_df = (
    pd.read_csv(csv_path)
    for csv_path in (HEK293T_data_path, humanDAG1_data_path, mastomysDAG1_data_path)
)

# Columns that together identify one mutation
mutation_keys = ["site", "wildtype", "mutant"]

# Keep only mutations measured in both ortholog data sets; overlapping value
# columns receive the `_human` / `_mastomys` suffixes.
ortholog_df = pd.merge(
    human_df,
    mastomys_df,
    how="inner",
    on=mutation_keys,
    suffixes=["_human", "_mastomys"],
    validate="one_to_one",
)

# Add the 293T measurements, label their columns explicitly, and append the
# mean times seen across the three data sets.
times_seen_cols = ["times_seen_human", "times_seen_mastomys", "times_seen_HEK293T"]
unfiltered_df = (
    pd.merge(
        ortholog_df,
        hek_df,
        how="inner",
        on=mutation_keys,
        validate="one_to_one",
    )
    .rename(
        columns={
            "effect": "effect_HEK293T",
            "times_seen": "times_seen_HEK293T",
            "n_selections": "n_selections_HEK293T",
        }
    )
    .assign(average_times_seen=lambda df: df[times_seen_cols].mean(axis=1))
)

# Keep rows whose number of selections equals `n_selections` in every data set
keep_row = pd.Series(True, index=unfiltered_df.index)
for n_sel_col in ("n_selections_human", "n_selections_mastomys", "n_selections_HEK293T"):
    keep_row &= unfiltered_df[n_sel_col] == n_selections
merged_df = unfiltered_df.loc[keep_row]

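Plot pairwise correlations of the functional scores as an interactive chart; the slider sets the minimum average times seen required for a point to be shown.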

In [6]:
# Calculate statistics
# Apply the average-times-seen filter once and reuse it for every comparison
# instead of rebuilding the same boolean mask for each argument.
stats_df = merged_df.loc[merged_df["average_times_seen"] >= MTS]

# (label used in the printout, first effect column, second effect column)
comparisons = [
    ("human vs HEK", "effect_human", "effect_HEK293T"),
    ("mastomys vs HEK", "effect_mastomys", "effect_HEK293T"),
    ("mastomys vs human", "effect_mastomys", "effect_human"),
]

for label, first_col, second_col in comparisons:
    # pearsonr returns (r, p-value); only r is reported
    r = sp.stats.pearsonr(stats_df[first_col], stats_df[second_col])[0]
    print(f"r correlation {label} (min_times_seen={MTS}): {r:.2f}")
    print(f"r^2 correlation {label} (min_times_seen={MTS}): {r**2:.2f}")

slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=MTS, bind=slider)

# Axis title for each measurement condition, keyed by the suffix of its
# `effect_*` / `times_seen_*` / `n_selections_*` columns.
effect_axis_titles = {
    "human": ["effect on cell entry measured in", "293T\u0394DAG1+humanDAG1 cells"],
    "mastomys": ["effect on cell entry measured in", "293T\u0394DAG1+mastomysDAG1 cells"],
    "HEK293T": "effect on cell entry measured in 293T cells",
}


def effect_encoding(channel, condition):
    """Return an X or Y encoding of the `effect_<condition>` column.

    Parameters
    ----------
    channel : `alt.X` or `alt.Y`
        Encoding channel class to instantiate.
    condition : str
        Key of `effect_axis_titles` ("human", "mastomys" or "HEK293T").

    All panels share the same tick values and axis domain so that they can be
    compared directly.
    """
    return channel(
        f"effect_{condition}",
        axis=alt.Axis(
            title=effect_axis_titles[condition],
            values=[-5, -4, -3, -2, -1, 0, 1],
            domainWidth=1,
            domainColor="black",
            tickColor="black",
        ),
        scale=alt.Scale(domain=[-5.5, 1.5]),
    )


def effect_scatter(data, x_condition, y_condition, times_seen_param, size):
    """Return a square scatter panel comparing effects in two conditions.

    Parameters
    ----------
    data : pandas.DataFrame
        Merged table with `effect_*`, `times_seen_*`, `n_selections_*` and
        `average_times_seen` columns.
    x_condition, y_condition : str
        Conditions plotted on the x and y axes.
    times_seen_param : altair parameter
        Points whose `average_times_seen` is below this parameter's value are
        drawn fully transparent.
    size : int
        Width and height of the panel.
    """
    # Tooltip lists the mutation first, then the x-axis and y-axis measurements
    tooltip_fields = ["site", "wildtype", "mutant"] + [
        f"{stat}_{condition}"
        for condition in (x_condition, y_condition)
        for stat in ("effect", "times_seen", "n_selections")
    ]
    return (
        alt.Chart(data)
        .mark_point(filled=True, color="black")
        .encode(
            effect_encoding(alt.X, x_condition),
            effect_encoding(alt.Y, y_condition),
            opacity=alt.condition(
                alt.datum.average_times_seen < times_seen_param,
                alt.value(0),
                alt.value(0.1),
            ),
            tooltip=tooltip_fields,
        )
        .properties(width=size, height=size)
    )


# Plot data
human_vs_hek = effect_scatter(merged_df, "human", "HEK293T", selector, size=300)
mastomys_vs_hek = effect_scatter(merged_df, "mastomys", "HEK293T", selector, size=300)
mastomys_vs_human = effect_scatter(merged_df, "human", "mastomys", selector, size=300)

corr_chart = alt.hconcat(
    human_vs_hek,
    mastomys_vs_hek,
    mastomys_vs_human,
    spacing=5,
    title="Correlations of functional selections for DAG1 orthologs",
).add_params(
    selector
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

# Make output dir if it doesn't exist. `makedirs(..., exist_ok=True)` also
# creates missing parent directories and does not fail if another process
# creates the directory between the check and the call.
os.makedirs(html_dir, exist_ok=True)

print(f"Saving to {html_output}")
corr_chart.save(html_output)

# Display the chart as the cell output
corr_chart
r correlation human vs HEK (min_times_seen=2): 0.93
r^2 correlation human vs HEK (min_times_seen=2): 0.87
r correlation mastomys vs HEK (min_times_seen=2): 0.93
r^2 correlation mastomys vs HEK (min_times_seen=2): 0.87
r correlation mastomys vs human (min_times_seen=2): 0.94
r^2 correlation mastomys vs human (min_times_seen=2): 0.89
Saving to results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html
Out[6]:

Recreate the same plot as above, but with smaller panels, fonts, and points for use as a manuscript figure.

In [7]:
# Calculate statistics
# Filter on average times seen once; every comparison below uses the same rows.
passes_mts = merged_df["average_times_seen"] >= MTS
filtered_df = merged_df.loc[passes_mts]

# Printed label -> pair of effect columns to correlate
effect_pairs = {
    "human vs HEK": ("effect_human", "effect_HEK293T"),
    "mastomys vs HEK": ("effect_mastomys", "effect_HEK293T"),
    "mastomys vs human": ("effect_mastomys", "effect_human"),
}

for pair_label, (col_a, col_b) in effect_pairs.items():
    # pearsonr returns (r, p-value); only r is reported
    r = sp.stats.pearsonr(filtered_df[col_a], filtered_df[col_b])[0]
    print(f"r correlation {pair_label} (min_times_seen={MTS}): {r:.2f}")
    print(f"r^2 correlation {pair_label} (min_times_seen={MTS}): {r**2:.2f}")

slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=MTS, bind=slider)

# Plot data
# Width and height of each square panel in the manuscript-sized figure
panel_size = 115

# Axis title per measurement condition, keyed by the column-name suffix
panel_axis_titles = {
    "human": ["effect on cell entry measured in", "293T\u0394DAG1+humanDAG1 cells"],
    "mastomys": ["effect on cell entry measured in", "293T\u0394DAG1+mastomysDAG1 cells"],
    "HEK293T": "effect on cell entry measured in 293T cells",
}

# Panel name -> (x-axis condition, y-axis condition)
panel_layout = {
    "human_vs_hek": ("human", "HEK293T"),
    "mastomys_vs_hek": ("mastomys", "HEK293T"),
    "mastomys_vs_human": ("human", "mastomys"),
}

panels = {}
for panel_name, (x_cond, y_cond) in panel_layout.items():
    # Both axes use identical ticks and domain; only field and title differ
    x_enc, y_enc = (
        channel(
            f"effect_{cond}",
            axis=alt.Axis(
                title=panel_axis_titles[cond],
                values=[-5, -4, -3, -2, -1, 0, 1],
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(domain=[-5.5, 1.5]),
        )
        for channel, cond in ((alt.X, x_cond), (alt.Y, y_cond))
    )
    panels[panel_name] = (
        alt.Chart(merged_df)
        .mark_point(filled=True, color="black")
        .encode(
            x_enc,
            y_enc,
            # Points below the slider threshold are drawn fully transparent
            opacity=alt.condition(
                alt.datum.average_times_seen < selector,
                alt.value(0),
                alt.value(0.1),
            ),
            # Mutation identity, then x-axis and y-axis measurements
            tooltip=["site", "wildtype", "mutant"]
            + [
                f"{stat}_{cond}"
                for cond in (x_cond, y_cond)
                for stat in ("effect", "times_seen", "n_selections")
            ],
        )
        .properties(width=panel_size, height=panel_size)
    )

human_vs_hek = panels["human_vs_hek"]
mastomys_vs_hek = panels["mastomys_vs_hek"]
mastomys_vs_human = panels["mastomys_vs_human"]

corr_chart = alt.hconcat(
    human_vs_hek,
    mastomys_vs_hek,
    mastomys_vs_human,
    spacing=5
).add_params(
    selector
).configure_axis(
    grid=False,
    labelFontSize=8,
    titleFontSize=8,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_point(
    size=10
)

# Display the chart as the cell output
corr_chart
r correlation human vs HEK (min_times_seen=2): 0.93
r^2 correlation human vs HEK (min_times_seen=2): 0.87
r correlation mastomys vs HEK (min_times_seen=2): 0.93
r^2 correlation mastomys vs HEK (min_times_seen=2): 0.87
r correlation mastomys vs human (min_times_seen=2): 0.94
r^2 correlation mastomys vs human (min_times_seen=2): 0.89
Out[7]: